In [96]:
!pip install git+http://git@github.com/kavgan/word_cloud.git
Collecting git+http://git@github.com/kavgan/word_cloud.git
  Cloning http://git@github.com/kavgan/word_cloud.git to /tmp/pip-req-build-k7v6wcds
Requirement already satisfied: scikit-learn==0.19.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from word-cloud==0.0.0) (0.19.1)
Collecting pandas==0.20.3 (from word-cloud==0.0.0)
  Downloading https://files.pythonhosted.org/packages/fe/6f/5733658857dffb998afa2120027171c263384ada0487a969e5ecd5bf9ac9/pandas-0.20.3-cp36-cp36m-manylinux1_x86_64.whl (24.5MB)
    100% |████████████████████████████████| 24.5MB 2.1MB/s 
Requirement already satisfied: python-dateutil>=2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pandas==0.20.3->word-cloud==0.0.0) (2.7.3)
Requirement already satisfied: numpy>=1.7.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pandas==0.20.3->word-cloud==0.0.0) (1.14.5)
Requirement already satisfied: pytz>=2011k in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pandas==0.20.3->word-cloud==0.0.0) (2018.4)
Requirement already satisfied: six>=1.5 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from python-dateutil>=2->pandas==0.20.3->word-cloud==0.0.0) (1.11.0)
Building wheels for collected packages: word-cloud
  Running setup.py bdist_wheel for word-cloud ... done
  Stored in directory: /tmp/pip-ephem-wheel-cache-rmpgi0nd/wheels/d5/72/19/d85b5e30feb40c15fd35e4b5819b75fbfa14e28696989170cb
Successfully built word-cloud
Installing collected packages: pandas, word-cloud
  Found existing installation: pandas 0.22.0
    Uninstalling pandas-0.22.0:
      Successfully uninstalled pandas-0.22.0
Successfully installed pandas-0.20.3 word-cloud-0.0.0
You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
In [92]:
 
/bin/sh: y: command not found
In [1]:
!pip install wordcloud
Collecting wordcloud
  Downloading https://files.pythonhosted.org/packages/ae/af/849edf14d573eba9c8082db898ff0d090428d9485371cc4fe21a66717ad2/wordcloud-1.5.0-cp36-cp36m-manylinux1_x86_64.whl (361kB)
    100% |████████████████████████████████| 368kB 43.7MB/s 
Requirement already satisfied: pillow in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from wordcloud) (5.2.0)
Requirement already satisfied: numpy>=1.6.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from wordcloud) (1.14.5)
Installing collected packages: wordcloud
Successfully installed wordcloud-1.5.0
You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
In [2]:
!pip install gensim
Collecting gensim
  Downloading https://files.pythonhosted.org/packages/27/a4/d10c0acc8528d838cda5eede0ee9c784caa598dbf40bd0911ff8d067a7eb/gensim-3.6.0-cp36-cp36m-manylinux1_x86_64.whl (23.6MB)
    100% |████████████████████████████████| 23.6MB 2.1MB/s 
Requirement already satisfied: scipy>=0.18.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from gensim) (1.1.0)
Requirement already satisfied: numpy>=1.11.3 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from gensim) (1.14.5)
Requirement already satisfied: six>=1.5.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from gensim) (1.11.0)
Collecting smart-open>=1.2.1 (from gensim)
  Downloading https://files.pythonhosted.org/packages/4b/1f/6f27e3682124de63ac97a0a5876da6186de6c19410feab66c1543afab055/smart_open-1.7.1.tar.gz
Requirement already satisfied: boto>=2.32 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from smart-open>=1.2.1->gensim) (2.48.0)
Collecting bz2file (from smart-open>=1.2.1->gensim)
  Downloading https://files.pythonhosted.org/packages/61/39/122222b5e85cd41c391b68a99ee296584b2a2d1d233e7ee32b4532384f2d/bz2file-0.98.tar.gz
Requirement already satisfied: requests in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from smart-open>=1.2.1->gensim) (2.18.4)
Requirement already satisfied: boto3 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from smart-open>=1.2.1->gensim) (1.9.47)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from requests->smart-open>=1.2.1->gensim) (3.0.4)
Requirement already satisfied: idna<2.7,>=2.5 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from requests->smart-open>=1.2.1->gensim) (2.6)
Requirement already satisfied: urllib3<1.23,>=1.21.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from requests->smart-open>=1.2.1->gensim) (1.22)
Requirement already satisfied: certifi>=2017.4.17 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from requests->smart-open>=1.2.1->gensim) (2018.8.24)
Requirement already satisfied: botocore<1.13.0,>=1.12.47 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from boto3->smart-open>=1.2.1->gensim) (1.12.47)
Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from boto3->smart-open>=1.2.1->gensim) (0.9.3)
Requirement already satisfied: s3transfer<0.2.0,>=0.1.10 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from boto3->smart-open>=1.2.1->gensim) (0.1.13)
Requirement already satisfied: python-dateutil<3.0.0,>=2.1; python_version >= "2.7" in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from botocore<1.13.0,>=1.12.47->boto3->smart-open>=1.2.1->gensim) (2.7.3)
Requirement already satisfied: docutils>=0.10 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from botocore<1.13.0,>=1.12.47->boto3->smart-open>=1.2.1->gensim) (0.14)
Building wheels for collected packages: smart-open, bz2file
  Running setup.py bdist_wheel for smart-open ... done
  Stored in directory: /home/ec2-user/.cache/pip/wheels/23/00/44/e5b939f7a80c04e32297dbd6d96fa3065af89ecf57e2b5f89f
  Running setup.py bdist_wheel for bz2file ... done
  Stored in directory: /home/ec2-user/.cache/pip/wheels/81/75/d6/e1317bf09bf1af5a30befc2a007869fa6e1f516b8f7c591cb9
Successfully built smart-open bz2file
Installing collected packages: bz2file, smart-open, gensim
Successfully installed bz2file-0.98 gensim-3.6.0 smart-open-1.7.1
You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
In [4]:
!pip install stop_words
Collecting stop_words
  Downloading https://files.pythonhosted.org/packages/1c/cb/d58290804b7a4c5daa42abbbe2a93c477ae53e45541b1825e86f0dfaaf63/stop-words-2018.7.23.tar.gz
Building wheels for collected packages: stop-words
  Running setup.py bdist_wheel for stop-words ... done
  Stored in directory: /home/ec2-user/.cache/pip/wheels/75/37/6a/2b295e03bd07290f0da95c3adb9a74ba95fbc333aa8b0c7c78
Successfully built stop-words
Installing collected packages: stop-words
Successfully installed stop-words-2018.7.23
You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
In [5]:
import numpy as np
import io
import pandas as pd
from stop_words import get_stop_words
from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models
import gensim
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image
import PIL.ImageOps
import random
from wordcloud import ImageColorGenerator

%matplotlib inline
%config InlineBackend.figure_format='retina'
In [10]:
df = pd.read_csv('small_700_through_710_descr_clm_code.csv')
df.drop('Unnamed: 0',axis=1, inplace=True)
df['descr_clm'] = df.descr + df.clm
df.drop(['descr','clm'],axis=1, inplace=True)
df['code'] = df['code'].astype('category')
df.head()
Out[10]:
code descr_clm
0 700 This application claims priority under 35 U.S....
1 700 BACKGROUND \n 1. Field of Invention \n ...
2 700 CROSS-REFERENCE TO RELATED APPLICATIONS \n ...
3 700 FIELD OF THE INVENTION \n The present inve...
4 700 RELATED APPLICATION \n This application cl...

Word Cloud for 705

In [11]:
df_705 = df[df['code']==705]
In [12]:
df_705.head()
Out[12]:
code descr_clm
522 705 CROSS-REFERENCE TO RELATED APPLICATIONS \n ...
523 705 CROSS-REFERENCE TO RELATED APPLICATIONS \n ...
524 705 BACKGROUND OF THE INVENTION \n This invent...
525 705 CROSS REFERENCE TO RELATED APPLICATIONS \n ...
526 705 CROSS-REFERENCE TO RELATED APPLICATIONS \n ...
In [73]:
from wordcloud import STOPWORDS
In [83]:
custom_stopword_list = ['using','said','based','wherein','comprising','comprise','diagram','system','process','method','one', 'may','claim','embodiment','invention','include', 'example', 'include','step','figure','fig']
In [84]:
stopwords = STOPWORDS
stopwords |= set(custom_stopword_list)
In [85]:
text = df_705.descr_clm.values

wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = stopwords).generate(str(text))
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

Word Cloud for 706

In [80]:
df_706 = df[df['code']==706]
In [81]:
df_706.head()
Out[81]:
code descr_clm
592 706 TECHNICAL FIELD \n The present technique r...
593 706 The present invention relates to an apparatus ...
594 706 CROSS REFERENCE TO OTHER APPLICATIONS \n T...
595 706 GOVERNMENT INTEREST \n The present inventi...
1930 706 CROSS-REFERENCE TO RELATED TO RELATED APPLICAT...
In [106]:
text_706 = df_706.descr_clm.values

wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = stopwords).generate(str(text_706))
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-106-3d6e8e72a101> in <module>()
      3 wordcloud = WordCloud(
      4     use_tfidf = True,
----> 5     stopwords = stopwords).generate(str(text_706))
      6 fig = plt.figure(
      7     figsize = (40, 30),

AttributeError: 'WordCloud' object has no attribute 'generate'
In [100]:
from word_cloud.word_cloud_generator import WordCloud
from IPython.core.display import HTML
wc=WordCloud(use_tfidf=True,stopwords=stopwords)
text = sort(text)
#don't randomize color, show only top 500
embed_code=wc.get_embed_code(text=text,random_color=True,topn=500)
HTML(embed_code)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-100-28f74b3446c2> in <module>()
      2 from IPython.core.display import HTML
      3 wc=WordCloud(use_tfidf=True,stopwords=stopwords)
----> 4 text = sort(text)
      5 #don't randomize color, show only top 500
      6 embed_code=wc.get_embed_code(text=text,random_color=True,topn=500)

NameError: name 'sort' is not defined

topic modeling

In [108]:
!pip install pyLDAvis
Collecting pyLDAvis
  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
    100% |████████████████████████████████| 1.6MB 23.0MB/s 
Requirement already satisfied: wheel>=0.23.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pyLDAvis) (0.31.1)
Requirement already satisfied: numpy>=1.9.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pyLDAvis) (1.14.5)
Requirement already satisfied: scipy>=0.18.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pyLDAvis) (1.1.0)
Requirement already satisfied: pandas>=0.17.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pyLDAvis) (0.20.3)
Collecting joblib>=0.8.4 (from pyLDAvis)
  Downloading https://files.pythonhosted.org/packages/0d/1b/995167f6c66848d4eb7eabc386aebe07a1571b397629b2eac3b7bebdc343/joblib-0.13.0-py2.py3-none-any.whl (276kB)
    100% |████████████████████████████████| 276kB 46.2MB/s 
Requirement already satisfied: jinja2>=2.7.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pyLDAvis) (2.10)
Requirement already satisfied: numexpr in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pyLDAvis) (2.6.5)
Requirement already satisfied: pytest in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pyLDAvis) (3.5.1)
Collecting future (from pyLDAvis)
  Downloading https://files.pythonhosted.org/packages/90/52/e20466b85000a181e1e144fd8305caf2cf475e2f9674e797b222f8105f5f/future-0.17.1.tar.gz (829kB)
    100% |████████████████████████████████| 829kB 32.8MB/s 
Collecting funcy (from pyLDAvis)
  Downloading https://files.pythonhosted.org/packages/47/a4/204fa23012e913839c2da4514b92f17da82bf5fc8c2c3d902fa3fa3c6eec/funcy-1.11-py2.py3-none-any.whl
Requirement already satisfied: python-dateutil>=2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pandas>=0.17.0->pyLDAvis) (2.7.3)
Requirement already satisfied: pytz>=2011k in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pandas>=0.17.0->pyLDAvis) (2018.4)
Requirement already satisfied: MarkupSafe>=0.23 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from jinja2>=2.7.2->pyLDAvis) (1.0)
Requirement already satisfied: py>=1.5.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pytest->pyLDAvis) (1.5.3)
Requirement already satisfied: six>=1.10.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pytest->pyLDAvis) (1.11.0)
Requirement already satisfied: setuptools in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pytest->pyLDAvis) (39.1.0)
Requirement already satisfied: attrs>=17.4.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pytest->pyLDAvis) (18.1.0)
Requirement already satisfied: more-itertools>=4.0.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pytest->pyLDAvis) (4.1.0)
Requirement already satisfied: pluggy<0.7,>=0.5 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pytest->pyLDAvis) (0.6.0)
Building wheels for collected packages: pyLDAvis, future
  Running setup.py bdist_wheel for pyLDAvis ... done
  Stored in directory: /home/ec2-user/.cache/pip/wheels/98/71/24/513a99e58bb6b8465bae4d2d5e9dba8f0bef8179e3051ac414
  Running setup.py bdist_wheel for future ... done
  Stored in directory: /home/ec2-user/.cache/pip/wheels/0c/61/d2/d6b7317325828fbb39ee6ad559dbe4664d0896da4721bf379e
Successfully built pyLDAvis future
Installing collected packages: joblib, future, funcy, pyLDAvis
Successfully installed funcy-1.11 future-0.17.1 joblib-0.13.0 pyLDAvis-2.1.2
You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
In [109]:
from __future__ import print_function
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
In [110]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
In [135]:
df_705_706 = df[(df['code'] == 705)|(df['code'] == 706)]
In [136]:
df_705_706['code'].unique()
Out[136]:
[705, 706]
Categories (2, int64): [705, 706]
In [144]:
df_705_706['code'].value_counts()
Out[144]:
706    1000
705    1000
710       0
709       0
708       0
707       0
704       0
703       0
702       0
701       0
700       0
Name: code, dtype: int64
In [160]:
tf_vectorizer_705_706 = CountVectorizer(strip_accents = 'unicode',
                                stop_words = stopwords,
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10)
dtm_tf_705_706 = tf_vectorizer_705_706.fit_transform(df_705_706.descr_clm.values)
print(dtm_tf_705_706.shape)
(2000, 7408)
In [161]:
tfidf_vectorizer_705_706 = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf_705_706 = tfidf_vectorizer_705_706.fit_transform(df_705_706.descr_clm.values)
print(dtm_tfidf_705_706.shape)
(2000, 7284)
In [162]:
# for TF DTM
lda_tf_705_706 = LatentDirichletAllocation(n_topics=2, random_state=0)
lda_tf_705_706.fit(dtm_tf_705_706)
# for TFIDF DTM
lda_tfidf_705_706 = LatentDirichletAllocation(n_topics=2, random_state=0)
lda_tfidf_705_706.fit(dtm_tfidf_705_706)
/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/decomposition/online_lda.py:294: DeprecationWarning: n_topics has been renamed to n_components in version 0.19 and will be removed in 0.21
  DeprecationWarning)
/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/decomposition/online_lda.py:536: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
  DeprecationWarning)
/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/decomposition/online_lda.py:294: DeprecationWarning: n_topics has been renamed to n_components in version 0.19 and will be removed in 0.21
  DeprecationWarning)
/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/decomposition/online_lda.py:536: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
  DeprecationWarning)
Out[162]:
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=2,
             perp_tol=0.1, random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)
In [163]:
pyLDAvis.sklearn.prepare(lda_tf_705_706, dtm_tf_705_706, tf_vectorizer_705_706)
Out[163]:
In [164]:
pyLDAvis.sklearn.prepare(lda_tfidf_705_706, dtm_tfidf_705_706, tfidf_vectorizer_705_706)
Out[164]:

Topic Modeling 705

In [165]:
tf_vectorizer_705 = CountVectorizer(strip_accents = 'unicode',
                                stop_words = stopwords,
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10)
dtm_tf_705 = tf_vectorizer_705.fit_transform(df_705.descr_clm.values)
print(dtm_tf_705.shape)
(1000, 5228)
In [166]:
tfidf_vectorizer_705 = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf_705 = tfidf_vectorizer_705.fit_transform(df_705.descr_clm.values)
print(dtm_tfidf_705.shape)
(1000, 5113)
In [171]:
# for TF DTM
lda_tf_705 = LatentDirichletAllocation(n_topics=5, random_state=0)
lda_tf.fit(dtm_tf_705)
# for TFIDF DTM
lda_tfidf_705 = LatentDirichletAllocation(n_topics=5, random_state=0)
lda_tfidf_705.fit(dtm_tfidf_705)
/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/decomposition/online_lda.py:294: DeprecationWarning: n_topics has been renamed to n_components in version 0.19 and will be removed in 0.21
  DeprecationWarning)
/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/decomposition/online_lda.py:536: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
  DeprecationWarning)
/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/decomposition/online_lda.py:294: DeprecationWarning: n_topics has been renamed to n_components in version 0.19 and will be removed in 0.21
  DeprecationWarning)
/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/decomposition/online_lda.py:536: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
  DeprecationWarning)
Out[171]:
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=5,
             perp_tol=0.1, random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)
In [172]:
pyLDAvis.sklearn.prepare(lda_tf_705, dtm_tf_705, tf_vectorizer_705)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-172-640c8ddc94b1> in <module>()
----> 1 pyLDAvis.sklearn.prepare(lda_tf_705, dtm_tf_705, tf_vectorizer_705)

~/anaconda3/envs/python3/lib/python3.6/site-packages/pyLDAvis/sklearn.py in prepare(lda_model, dtm, vectorizer, **kwargs)
     92     See `pyLDAvis.prepare` for **kwargs.
     93     """
---> 94     opts = fp.merge(_extract_data(lda_model, dtm, vectorizer), kwargs)
     95     return pyLDAvis.prepare(**opts)

~/anaconda3/envs/python3/lib/python3.6/site-packages/pyLDAvis/sklearn.py in _extract_data(lda_model, dtm, vectorizer)
     39     doc_lengths = _get_doc_lengths(dtm)
     40     term_freqs = _get_term_freqs(dtm)
---> 41     topic_term_dists = _get_topic_term_dists(lda_model)
     42 
     43     assert term_freqs.shape[0] == len(vocab), \

~/anaconda3/envs/python3/lib/python3.6/site-packages/pyLDAvis/sklearn.py in _get_topic_term_dists(lda_model)
     32 
     33 def _get_topic_term_dists(lda_model):
---> 34     return _row_norm(lda_model.components_)
     35 
     36 

AttributeError: 'LatentDirichletAllocation' object has no attribute 'components_'
In [173]:
pyLDAvis.sklearn.prepare(lda_tfidf_705, dtm_tfidf_705, tfidf_vectorizer_705)
Out[173]:

Topic Modelling 706

In [174]:
tf_vectorizer_706 = CountVectorizer(strip_accents = 'unicode',
                                stop_words = stopwords,
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10)
dtm_tf_706 = tf_vectorizer_706.fit_transform(df_706.descr_clm.values)
print(dtm_tf_706.shape)
(1000, 5083)
In [175]:
tfidf_vectorizer_706 = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf_706 = tfidf_vectorizer_706.fit_transform(df_706.descr_clm.values)
print(dtm_tfidf_706.shape)
(1000, 4970)
In [176]:
# for TF DTM
lda_tf_706 = LatentDirichletAllocation(n_topics=5, random_state=0)
lda_tf_706.fit(dtm_tf_706)
# for TFIDF DTM
lda_tfidf_706 = LatentDirichletAllocation(n_topics=5, random_state=0)
lda_tfidf_706.fit(dtm_tfidf_706)
/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/decomposition/online_lda.py:294: DeprecationWarning: n_topics has been renamed to n_components in version 0.19 and will be removed in 0.21
  DeprecationWarning)
/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/decomposition/online_lda.py:536: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
  DeprecationWarning)
/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/decomposition/online_lda.py:294: DeprecationWarning: n_topics has been renamed to n_components in version 0.19 and will be removed in 0.21
  DeprecationWarning)
/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/decomposition/online_lda.py:536: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
  DeprecationWarning)
Out[176]:
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=5,
             perp_tol=0.1, random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)
In [177]:
pyLDAvis.sklearn.prepare(lda_tf_706, dtm_tf_706, tf_vectorizer_706)
Out[177]:
In [178]:
pyLDAvis.sklearn.prepare(lda_tfidf_706, dtm_tfidf_706, tfidf_vectorizer_706)
Out[178]:
In [ ]: